Create density plots for all correlation datasets (fly genetic interaction profile correlation, fly morphology feature correlation, yeast genetic interaction profile similarity, human gene effect correlation), showing shift of density between internal correlations and external correlations.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
## Annotation files ##
# Load list of orthologue pairs from Ensembl database -27.11.2019
orthologues <- read_delim("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/new_orthologues_list_copy.txt",delim = "\t")
## Parsed with column specification:
## cols(
## Gene_stable_ID = col_character(),
## S_cerevisiae_gene_stable_ID = col_character(),
## S_cerevisiae_gene_name = col_character(),
## percentage_id_query_gene_identical_to_target_S_cerevisiae_gene = col_double(),
## S_cerevisiae_orthology_confidence_0low_1high = col_double(),
## Human_gene_stable_ID = col_character(),
## Human_gene_name = col_character(),
## Human_orthology_confidence_0low_1high = col_double(),
## percentage_id_query_gene_identical_to_target_Human_gene = col_double(),
## Gene_name = col_character()
## )
# Load complex annotations table (fly) - copied from Maria's folder
fly_complex_annotations <- read_delim("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/complex_annotations(1).txt", delim = "\t") %>%
rename(Gene_name = genes)
## Parsed with column specification:
## cols(
## genes = col_character(),
## complex = col_character()
## )
## Correlation files ##
# Genetic interaction profile correlation for Fly (based on cell count), (Florian, 29.01.2020):
load(file = "/Users/b110/Desktop/VeraPeters/conserved_functional_groups/cell_count_interaction_correlations_pearson_p_unfiltered_fly.RData")
# This file is named GI_corrs,
# In following markdown, will be referred to as cellcount_corrs
# Morphology feature correlation, (Florian, 03.02.2020):
load(file = "/Users/b110/Desktop/VeraPeters/conserved_functional_groups/feature_correlations_pearson_p_unfiltered_fly.RData")
# This file is named feature_corrs
# Genetic interaction profile similarity values for Yeast:
# Read genetic interaction profile similarity values (yeast, https://thecellmap.org/costanzo2016/ - downloaded Jan 27, 2020 (Data File S3))):
# Description: Matrix file containing genetic interaction profile similarity values (as measured by Pearson correlation) for every pair of mutant strains in the dataset. Similarity values were computed for all gene pairs combined (ALL). The matrix contains 2 sets of row and column headers, providing a unique allele name for every mutant strain (row & column header #1) as well as a systematic ORF name (row & column header #2).
yeast_correlation <- read_delim("/Users/b110/Desktop/VeraPeters/source_data/costanzo_2016/Data File S3. Genetic interaction profile similarity matrices/cc_ALL.txt", delim = "\t")
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2]
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
# Load human dataset
# Load dataset filtered for genes in the orthologues list (created in chunk 4 of cons_functional_groups_human markdown):
ceres_corrs_sep_in_ortho <- read_csv(file = "/Users/b110/Desktop/VeraPeters/conserved_functional_groups/ceres_corrs_sep_filtered.csv")
## Parsed with column specification:
## cols(
## x = col_character(),
## xnum = col_character(),
## y = col_character(),
## ynum = col_character(),
## r = col_double()
## )
# Add complex annotations to orthologues dataset, keep only rows with complex annotation -> inner_join
orthologues_annotations <- inner_join(orthologues, fly_complex_annotations, by = "Gene_name")
# Convert S_cerevisiae_gene_name to lowercase letters (cause they are lowercase in the yeast correlation dataset)
orthologues_annotations <- orthologues_annotations %>%
mutate(S_cerevisiae_gene_name = tolower(S_cerevisiae_gene_name))
The following chunks are tidying and filtering the four datasets. Goal: keep only values for genes which are in the orthologues list and have a complex annotation
# Use Florian's code to separate gene and IDs into single columns:
# For cell count-based data:
fly_correlation_cellcount <- GI_corrs %>%
separate(x,c("genex","idx"),sep = "öö")%>%
separate(y,c("geney","idy"),sep = "öö") %>%
dplyr::select(genex,geney,idx,idy,r)
dim(fly_correlation_cellcount)
## [1] 23526370 5
# [1] 23526370 5
# For feature-based data:
fly_correlation_features <- feature_corrs %>%
separate(x,c("genex","idx"),sep = "öö")%>%
separate(y,c("geney","idy"),sep = "öö") %>%
dplyr::select(genex,geney,idx,idy,r)
dim(fly_correlation_features)
## [1] 23526370 5
# [1] 23526370 5
# Some of the correlations correspond to wells without dsRNA or with only one dsRNA?
# In those columns, the gene is marked with "-" -> I'm gonna filter out lines in which genex or geney equals "-"
fly_correlation_cellcount <- fly_correlation_cellcount %>%
filter(genex != "-", geney != "-")
dim(fly_correlation_cellcount)
## [1] 23498940 5
# [1] 23498940 5
fly_correlation_features <- fly_correlation_features %>%
filter(genex != "-", geney != "-")
dim(fly_correlation_features)
## [1] 23498940 5
# [1] 23498940 5
# Filter datasets for genes present in orthologues list - I want both of the genes to be in the orthologues list
fly_corr_cellcount_in_ortho <- fly_correlation_cellcount %>%
filter(genex %in% orthologues_annotations$Gene_name & geney %in% orthologues_annotations$Gene_name)
dim(fly_corr_cellcount_in_ortho)
## [1] 67528 5
# [1] 67528 5
fly_corr_features_in_ortho <- fly_correlation_features %>%
filter(genex %in% orthologues_annotations$Gene_name & geney %in% orthologues_annotations$Gene_name)
dim(fly_corr_features_in_ortho)
## [1] 67528 5
# [1] 67528 5
# Create new columns in those two datasets (complex_1 and complex_2), by left-joining with the orthologues_annotations list and add column "type"
# Cell count-based data:
fly_corr_cellcount_anno <- fly_corr_cellcount_in_ortho %>%
rename(Gene_name = genex) %>%
left_join((orthologues_annotations %>% select(Gene_name, complex) %>% drop_na() %>% distinct()), by = "Gene_name") %>%
rename(Gene_1 = Gene_name, Complex_1 = complex) %>%
rename(Gene_name = geney) %>%
left_join((orthologues_annotations %>% select(Gene_name, complex) %>% drop_na() %>% distinct()), by = "Gene_name") %>%
rename(Gene_2 = Gene_name, Complex_2 = complex) %>%
mutate(type = ifelse(Complex_1 == Complex_2, "internal", "external"))
dim(fly_corr_cellcount_anno)
## [1] 67528 8
# [1] 67528 8 <- same number of rows as before
# Feature-based data:
fly_corr_features_anno <- fly_corr_features_in_ortho %>%
rename(Gene_name = genex) %>%
left_join((orthologues_annotations %>% select(Gene_name, complex) %>% drop_na() %>% distinct()), by = "Gene_name") %>%
rename(Gene_1 = Gene_name, Complex_1 = complex) %>%
rename(Gene_name = geney) %>%
left_join((orthologues_annotations %>% select(Gene_name, complex) %>% drop_na() %>% distinct()), by = "Gene_name") %>%
rename(Gene_2 = Gene_name, Complex_2 = complex) %>%
mutate(type = ifelse(Complex_1 == Complex_2, "internal", "external"))
dim(fly_corr_features_anno)
## [1] 67528 8
# [1] 67528 8 <- same number of rows as before
# Let's see if the columns for genes and complexes in the two dataframes are identical, so I could create a common dataframe with both r values
identical(fly_corr_cellcount_anno$Gene_1, fly_corr_features_anno$Gene_1)
## [1] TRUE
identical(fly_corr_cellcount_anno$Gene_2, fly_corr_features_anno$Gene_2)
## [1] TRUE
identical(fly_corr_cellcount_anno$Complex_1, fly_corr_features_anno$Complex_1)
## [1] TRUE
identical(fly_corr_cellcount_anno$Complex_2, fly_corr_features_anno$Complex_2)
## [1] TRUE
identical(fly_corr_cellcount_anno$type, fly_corr_features_anno$type)
## [1] TRUE
# returs TRUE for all of them
# Create common dataframe
fly_corr_common_anno <- fly_corr_cellcount_anno %>%
rename(r_cellcount = r) %>% # needed to avoid columns with the same name
select(-Complex_1, -Complex_2, -type) %>% # I don't need those (they are identical to the features dataframe)
left_join(fly_corr_features_anno, by = c("Gene_1", "Gene_2", "idx", "idy")) %>% # combine both tables (all four columns are needed in "by" to avoid duplications)
select(-idx, -idy) %>% # Now I don't need the IDs anymore I think
rename(r_features = r) %>% # rename the second r column to clarify which belongs to which dataset
select(Gene_1, Gene_2, Complex_1, Complex_2, type, r_cellcount, r_features) # re-order the columns
dim(fly_corr_common_anno)
## [1] 67528 7
# [1] 67528 7 # same number of rows as the original dataframes
dim(fly_corr_common_anno %>% distinct())
## [1] 67528 7
# [1] 67528 7 # no duplications
# Fly dataset (containing both cell count- as well as feature- based correlations) with genes present in orthologues list and complex annotations
# Drop ORF columns
dim(yeast_correlation)
## [1] 6231 6232
# [1] 6231 6232
yeast_correlation <- yeast_correlation %>%
rename(allele_name = X1) %>% # rename column for allele
select(-X2) %>% # drop column X2 (= column containing ORF names)
drop_na() # there is only one row containing NAs which is the first row, which has the ORF names, so I'll just use the drop_na() function to get rid of the row
dim(yeast_correlation)
## [1] 6230 6231
# 6230 6231 <- one column and one row less than before
# There are duplicates (due to the fact that the matrix is symmetrical), for example, there is gene_name_1 aap1 and gene_name_2 aat2, as well as gene_name_1 aat2 and gene_name_2 aap1
# example:
# aat2 NA aap1 NA -0.05033
# aap1 NA aat2 NA -0.05033
# Set column allele_names as rownames
yeast_correlation_rownames <- yeast_correlation %>%
column_to_rownames(var = "allele_name")
# First, I need to check whether the column names and row names are in the same order
identical(rownames(yeast_correlation_rownames), colnames(yeast_correlation_rownames))
## [1] TRUE
# [1] TRUE # fortunately, they are
# Replace upper triangle of the matrix with NA, including the diagonale
yeast_correlation_rownames[upper.tri(yeast_correlation_rownames, diag=TRUE)] <- NA
# Convert rownames back to column
yeast_correlation <- yeast_correlation_rownames%>%
rownames_to_column(var = "allele_name")
# Gather
yeast_correlation_gathered <- yeast_correlation %>%
gather(allele_name_2, value, -allele_name)
dim(yeast_correlation_gathered)
## [1] 38812900 3
#[1] 38812900 3
# Drop the NAs. Additionally, for some allele combinations, the value is "NaN"" - I cannot use those, so I'll filter them out
yeast_correlation_gathered <- yeast_correlation_gathered %>%
drop_na() %>%
filter(value != "NaN", value != "NA") # for whatever reason, the drop_na() did not work, but filtering for the string "NA" did
dim(yeast_correlation_gathered)
## [1] 17516577 3
# [1] 17516577 3
# Less than half of the rows. makes sense, because diagonal is excluded
# Separate allele_name into gene name and allele name:
# Using merge (allows only as many separations as number of new columns I give):
yeast_correlation_gathered_sep <- yeast_correlation_gathered %>%
separate(allele_name, into = c("gene_name_1", "allele_name_1"), sep = "-", extra = "merge") %>%
separate(allele_name_2, into = c("gene_name_2", "allele_name_2"), sep = "-", extra = "merge")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 13078705 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 18, 19, 20, 21, 22, 23, 24, 25, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 12930154 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Filter yeast dataset for genes present in the orthologues list - I want both of the genes to be in the orthologues list
yeast_corr_in_ortho <- yeast_correlation_gathered_sep %>%
filter(gene_name_1 %in% orthologues_annotations$S_cerevisiae_gene_name & gene_name_2 %in% orthologues_annotations$S_cerevisiae_gene_name)
dim(yeast_corr_in_ortho)
## [1] 20795 5
# [1] 20795 5
# Create new columns in this dataset (complex_1 and complex_2), by left-joining with the orthologues_annotations list and add column "type"
yeast_corr_anno_multiple <- yeast_corr_in_ortho %>%
rename(S_cerevisiae_gene_name = gene_name_1) %>%
left_join((orthologues_annotations %>% select(S_cerevisiae_gene_name, complex) %>% drop_na() %>% distinct()), by = "S_cerevisiae_gene_name") %>%
rename(gene_name_1 = S_cerevisiae_gene_name, Complex_1 = complex) %>%
rename(S_cerevisiae_gene_name = gene_name_2) %>%
left_join((orthologues_annotations %>% select(S_cerevisiae_gene_name, complex) %>% drop_na() %>% distinct()), by = "S_cerevisiae_gene_name") %>%
rename(gene_name_2 = S_cerevisiae_gene_name, Complex_2 = complex) %>%
mutate(type = ifelse(Complex_1 == Complex_2, "internal", "external"))
dim(yeast_corr_anno_multiple)
## [1] 21829 8
# [1] 21829 8
# There are more rows than before, probably because there are genes that have multiple complex annotations
nrow(orthologues_annotations %>% select(S_cerevisiae_gene_name, complex) %>% drop_na() %>% distinct())
## [1] 180
# [1] 180
nrow(orthologues_annotations %>% select(S_cerevisiae_gene_name) %>% drop_na() %>% distinct())
## [1] 176
# [1] 176
# 4 more complex annotations than unique gene names
# Which genes have more than one complex annotation?
orthologues_yeast_morethan1 <- orthologues_annotations %>%
select(S_cerevisiae_gene_name, complex) %>%
drop_na() %>%
distinct() %>%
group_by(S_cerevisiae_gene_name) %>%
filter(n() > 1) %>%
count()
# Groups: S_cerevisiae_gene_name [4]
# S_cerevisiae_gene_name n
# <chr> <int>
#1 rpn3 2
#2 rpn5 2
#3 rpn6 2
#4 taf14 2
# Create dataset based on unique gene - complex annotations by using highest percentage identity
# Create new "human sub-dataframe" of orthologues list
orthologues_annotations_yeast <- orthologues_annotations %>%
select(S_cerevisiae_gene_name, percentage_id_query_gene_identical_to_target_S_cerevisiae_gene, Gene_name, complex) %>%
drop_na()
orthologues_annotations_yeast <- orthologues_annotations_yeast %>%
group_by(S_cerevisiae_gene_name) %>%
arrange(desc(percentage_id_query_gene_identical_to_target_S_cerevisiae_gene)) %>% # arrange percentage identical values (decreasing)
do(head(.,1)) %>% # keep only the row with the highest percentage identical value
ungroup()
dim(orthologues_annotations_yeast)
## [1] 176 4
# [1] 176 6
# Create new columns in those two datasets (complex_1 and complex_2), by left-joining with the orthologues_annotations list and add column "type"
yeast_corr_anno_unique <- yeast_corr_in_ortho %>%
rename(S_cerevisiae_gene_name = gene_name_1) %>%
left_join((orthologues_annotations_yeast %>% select(S_cerevisiae_gene_name, complex) %>% drop_na() %>% distinct()), by = "S_cerevisiae_gene_name") %>%
rename(gene_name_1 = S_cerevisiae_gene_name, Complex_1 = complex) %>%
rename(S_cerevisiae_gene_name = gene_name_2) %>%
left_join((orthologues_annotations_yeast %>% select(S_cerevisiae_gene_name, complex) %>% drop_na() %>% distinct()), by = "S_cerevisiae_gene_name") %>%
rename(gene_name_2 = S_cerevisiae_gene_name, Complex_2 = complex) %>%
mutate(type = ifelse(Complex_1 == Complex_2, "internal", "external"))
dim(yeast_corr_anno_unique)
## [1] 20795 8
# [1] 20795 8 <- same numbers of rows as before adding the complex annotation
# Now I've got two yeast datasets; both contain only genes present in the orthologues list. One of them has multiple complex annotations for one gene, the other one has a unique complex annotation for each gene
# I'm gonna use the unfiltered one (multiple) from here on, because otherwise I lose the n >= 4 requirement for COP9
# The human dataset was already filtered to contain only genes present in the orthologues list (see cons_functional_groups_human markdown)
# But: orthologues list was shortened by adding complex annotations (inner_join) -> a second filtering is necessary to keep only genes that have a complex annotation
dim(ceres_corrs_sep_in_ortho)
## [1] 37208251 5
# [1] 37208251 5
# Filter for genes present in orthologues_annotations list:
ceres_corrs_sep_in_ortho_anno <- ceres_corrs_sep_in_ortho %>%
select(-xnum, -ynum) %>%
filter(x %in% orthologues_annotations$Human_gene_name & y %in% orthologues_annotations$Human_gene_name)
dim(ceres_corrs_sep_in_ortho_anno)
## [1] 173166 3
# [1] 173166 3
# Add columns complex_x and complex_y, and type:
human_corr_anno <- ceres_corrs_sep_in_ortho_anno %>%
rename(Human_gene_name = x) %>% # rename to enable join by
left_join((orthologues_annotations %>% select(Human_gene_name, complex) %>% drop_na() %>% distinct()), by = "Human_gene_name") %>% # join complex annotations first gene
rename(Gene_1 = Human_gene_name, Complex_1 = complex) %>% # rename to avoid duplications
rename(Human_gene_name = y) %>% # rename to enable join by
left_join((orthologues_annotations %>% select(Human_gene_name, complex) %>% drop_na() %>% distinct()), by = "Human_gene_name") %>% # join complex annotations second gene
rename(Gene_2 = Human_gene_name, Complex_2 = complex) %>% # rename to naming consistent
mutate(type = ifelse(Complex_1 == Complex_2, "internal", "external")) # add column "type" to specify internal vs. external
dim(human_corr_anno)
## [1] 173166 6
# 173166 6
# Human dataset with genes present in orthologues list and complex annotations
# Create vectors containing complexes with n >= 4 genes for all four (three, considering that fly has same genes twice) datasets and for both gene name columns
C1_n4_fly <- (fly_corr_common_anno %>%
select(Gene_1, Complex_1) %>%
distinct() %>%
group_by(Complex_1) %>%
filter(n() >= 4) %>%
count() %>%
arrange(Complex_1))$Complex_1
C2_n4_fly <- (fly_corr_common_anno %>%
select(Gene_2, Complex_2) %>%
distinct() %>%
group_by(Complex_2) %>%
filter(n() >= 4) %>%
count() %>%
arrange(Complex_2))$Complex_2
C1_n4_yeast <- (yeast_corr_anno_multiple %>%
select(gene_name_1, Complex_1) %>%
distinct() %>%
group_by(Complex_1) %>%
filter(n() >= 4) %>%
count() %>%
arrange(Complex_1))$Complex_1
C2_n4_yeast <- (yeast_corr_anno_multiple %>%
select(gene_name_2, Complex_2) %>%
distinct() %>%
group_by(Complex_2) %>%
filter(n() >= 4) %>%
count() %>%
arrange(Complex_2))$Complex_2
C1_n4_human <- (human_corr_anno %>%
select(Gene_1, Complex_1) %>%
distinct() %>%
group_by(Complex_1) %>%
filter(n () >= 4) %>%
count() %>%
arrange(Complex_1))$Complex_1
C2_n4_human <- (human_corr_anno %>%
select(Gene_2, Complex_2) %>%
distinct() %>%
group_by(Complex_2) %>%
filter(n () >= 4) %>%
count() %>%
arrange(Complex_2))$Complex_2
# Check length of vectors, choose shortest one:
length(C1_n4_fly)
## [1] 27
length(C2_n4_fly)
## [1] 27
length(C1_n4_yeast)
## [1] 21
length(C2_n4_yeast)
## [1] 21
length(C1_n4_human)
## [1] 32
length(C2_n4_human)
## [1] 32
# The two yeast vectors are the shortest ones (21)
# Compare if they are the same
identical(C1_n4_yeast, C2_n4_yeast)
## [1] TRUE
# [1] TRUE
# Check, if all of these 20 complexes in yeast have at least 4 genes in the other organisms
which(! C1_n4_yeast %in% C1_n4_human)
## integer(0)
# integer(0)
which(! C1_n4_yeast %in% C2_n4_human)
## integer(0)
# integer(0)
# -> All complexes that have >= 4 genes in yeast also have >= 4 genes in fly
which(! C1_n4_yeast %in% C1_n4_fly)
## [1] 15
# [1] 15
which(! C1_n4_yeast %in% C2_n4_fly)
## [1] 15
# [1] 15
# -> There is one complex with >= 4 genes in yeast, that does not have >= 4 genes in fly
# Which one?
C1_n4_yeast[15]
## [1] "PFDN"
# [1] "PFDN"
# Create new vector with 20 complexes that have n >= 4 in all datasets
complexes_n4_vec <- C1_n4_yeast[-15]
complexes_n4_vec
## [1] "AP" "APC" "BAPPBAP" "COMP" "COP9" "COPII" "ESCRT"
## [8] "GATOR" "HSP60" "IlR" "IlRNeg" "MED" "OXPHOS" "PATPASE"
## [15] "PROT" "RAS" "RNAPII" "TFIID" "TIP60" "VPSC"
#[1] "AP" "APC" "BAPPBAP" "COMP" "COP9" "COPII" "ESCRT" "GATOR" "HSP60" "IlR" "IlRNeg" "MED" "OXPHOS" "PATPASE" "PROT" "RAS" "RNAPII"
#[18] "TFIID" "TIP60" "VPSC"
# Fly dataset:
fly_corr_common_anno_n4 <- fly_corr_common_anno %>%
filter(Complex_1 %in% complexes_n4_vec & Complex_2 %in% complexes_n4_vec)
# Yeast dataset:
yeast_corr_anno_multiple_n4 <- yeast_corr_anno_multiple %>%
filter(Complex_1 %in% complexes_n4_vec & Complex_2 %in% complexes_n4_vec)
# Human dataset:
human_corr_anno_n4 <- human_corr_anno %>%
filter(Complex_1 %in% complexes_n4_vec & Complex_2 %in% complexes_n4_vec)
# Calculate p-values of internal vs. external shift, add column with rounded p-values or "< 0.00001" to improve visualization later
# Calculate p-values for fly dataset:
tt_fly <- data.frame(matrix(ncol = 3, nrow = length(complexes_n4_vec))) %>%
rename(complex = X1, pval_cellcount = X2, pval_features = X3)
j <- 1
for (i in (complexes_n4_vec)) {
tt_fly$complex[j] <- as.character(i)
tt_fly$pval_cellcount[j] <- t.test(formula = r_cellcount ~type, alternative = c("two.sided"), data = fly_corr_common_anno_n4 %>% filter(Complex_1 == i | Complex_2 == i))$p.value
tt_fly$pval_features[j] <- t.test(formula = r_features ~type, alternative = c("two.sided"), data = fly_corr_common_anno_n4 %>% filter(Complex_1 == i | Complex_2 == i))$p.value
j <- j+1
}
tt_fly <- tt_fly %>%
mutate(pvalround_cellcount = ifelse(pval_cellcount < 0.00001, "< 0.00001", as.character(paste0 ("= ",round(pval_cellcount, digits = 5))))) %>%
mutate(pvalround_features = ifelse(pval_features < 0.00001, "< 0.00001", as.character(paste0 ("= ",round(pval_features, digits = 5)))))
# Calculate p-values for yeast dataset:
tt_yeast <- data.frame(matrix(ncol = 2, nrow = length(complexes_n4_vec))) %>%
rename(complex = X1, pval = X2)
j <- 1
for (i in (complexes_n4_vec)) {
tt_yeast$complex[j] <- as.character(i)
tt_yeast$pval[j] <- t.test(formula = as.numeric(value) ~type, alternative = c("two.sided"), data = yeast_corr_anno_multiple_n4 %>% filter(Complex_1 == i | Complex_2 == i))$p.value
j <- j+1
}
tt_yeast <- tt_yeast %>%
mutate(pvalround = ifelse(pval < 0.00001, "< 0.00001", as.character(paste0 ("= ",round(pval, digits = 5))))) # round pvalue/set to < x, to make display easier
# Calculate p-values for human dataset:
tt_human <- data.frame(matrix(ncol = 2, nrow = length(complexes_n4_vec))) %>%
rename(complex = X1, pval = X2)
j <- 1
for (i in (complexes_n4_vec)) {
tt_human$complex[j] <- as.character(i)
tt_human$pval[j] <- t.test(formula = r ~type, alternative = c("two.sided"), data = human_corr_anno_n4 %>% filter(Complex_1 == i | Complex_2 == i))$p.value
j <- j+1
}
tt_human <- tt_human %>%
mutate(pvalround = ifelse(pval < 0.00001, "< 0.00001", as.character(paste0 ("= ",round(pval, digits = 5))))) # round pvalue/set to < x, to make display easier
# Define b110 theme:
theme_b110<-function(){
theme_classic() +
theme(
axis.text=element_text(size = 16),
axis.title=element_text(size = 16),
plot.title = element_text(size = 22,hjust = 0.5,face="bold"),
legend.title = element_text(size = 22),
legend.text = element_text(size =16),
legend.position = "bottom"
)
}
# Fly (cellcount):
for (i in (complexes_n4_vec)) {
plt <- fly_corr_common_anno_n4 %>%
filter(Complex_1 == i | Complex_2 == i) %>%
ggplot(aes(x = as.numeric(r_cellcount), fill = type)) +
geom_density(alpha = 0.3) +
scale_fill_manual(values = c("grey", "#4285f4")) +
theme_b110() +
xlab("pearson correlation") +
ylab("density") +
geom_vline(xintercept = 0, color = "black", linetype ="dashed") +
scale_y_continuous(expand = c(0,0)) +
ggtitle(paste0(" Distribution of pearson correlations, complex ", as.character(i), ", \n fly (genetic interaction profile based on cell count)"))
plt +
geom_text(aes(label = paste0("p ", (tt_fly %>% filter(complex == i))$pvalround_cellcount)), x = layer_scales(plt)$x$range$range[2]/1.5, y = layer_scales(plt)$y$range$range[2]-layer_scales(plt)$y$range$range[2]/50, size = 5)
print(plt)
#ggsave(paste0("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_PDF/density_fly_cellcount_", i, ".pdf"), width = 9, height = 9)
#ggsave(paste0("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_png/density_fly_cellcount_", i, ".png"), width = 9, height = 9)
}
# Fly (features)
for (i in (complexes_n4_vec)) {
plt <- fly_corr_common_anno_n4 %>%
filter(Complex_1 == i | Complex_2 == i) %>%
ggplot(aes(x = as.numeric(r_features), fill = type)) +
geom_density(alpha = 0.3) +
scale_fill_manual(values = c("grey", "#d62d20")) +
theme_b110() +
xlab("pearson correlation") +
ylab("density") +
geom_vline(xintercept = 0, color = "black", linetype ="dashed") +
scale_y_continuous(expand = c(0,0)) +
ggtitle(paste0(" Distribution of pearson correlations, complex ", as.character(i), ", \n fly (morphology features)"))
plt +
geom_text(aes(label = paste0("p ", (tt_fly %>% filter(complex == i))$pvalround_features)), x = layer_scales(plt)$x$range$range[2]/1.5, y = layer_scales(plt)$y$range$range[2]-layer_scales(plt)$y$range$range[2]/50, size = 5)
print(plt)
#ggsave(paste0("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_PDF/density_fly_features_", i, ".pdf"), width = 9, height = 9)
#ggsave(paste0("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_png/density_fly_features_", i, ".png"), width = 9, height = 9)
}
# Yeast
for (i in (complexes_n4_vec)) {
plt <- yeast_corr_anno_multiple_n4 %>%
filter(Complex_1 == i | Complex_2 == i) %>%
ggplot(aes(x = as.numeric(value), fill = type)) +
geom_density(alpha = 0.3) +
scale_fill_manual(values = c("grey", "#ffa700")) +
theme_b110() +
xlab("pearson correlation") +
ylab("density") +
geom_vline(xintercept = 0, color = "black", linetype ="dashed") +
scale_y_continuous(expand = c(0,0)) +
ggtitle(paste0(" Distribution of pearson correlations, complex ", as.character(i), ", \n yeast (genetic interaction profile similarity)"))
plt +
geom_text(aes(label = paste0("p ", (tt_yeast %>% filter(complex == i))$pvalround)), x = layer_scales(plt)$x$range$range[2]/1.5, y = layer_scales(plt)$y$range$range[2]-layer_scales(plt)$y$range$range[2]/50, size = 5)
print(plt)
#ggsave(paste0("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_PDF/density_yeast_", i, ".pdf"), width = 9, height = 9)
#ggsave(paste0("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_png/density_yeast_", i, ".png"), width = 9, height = 9)
}
# Human
for (i in (complexes_n4_vec)) {
plt <- human_corr_anno_n4 %>%
filter(Complex_1 == i | Complex_2 == i) %>%
ggplot(aes(x = as.numeric(r), fill = type)) +
geom_density(alpha = 0.3) +
scale_fill_manual(values = c("grey", "#008744")) +
theme_b110() +
xlab("pearson correlation") +
ylab("density") +
geom_vline(xintercept = 0, color = "black", linetype ="dashed") +
scale_y_continuous(expand = c(0,0)) +
ggtitle(paste0(" Distribution of pearson correlations, complex ", as.character(i), ", \n human (gene effect)"))
plt +
geom_text(aes(label = paste0("p ", (tt_human %>% filter(complex == i))$pvalround)), x = layer_scales(plt)$x$range$range[2]/1.5, y = layer_scales(plt)$y$range$range[2]-layer_scales(plt)$y$range$range[2]/50, size = 5)
print(plt)
#ggsave(paste0("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_PDF/density_human", i, ".pdf"), width = 9, height = 9)
#ggsave(paste0("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_png/density_human", i, ".png"), width = 9, height = 9)
}
# Density histograms:
# Fly cellcount
ggplot(fly_corr_common_anno_n4, aes(x = r_cellcount)) +
theme_b110() +
geom_histogram(binwidth = 0.001, color = "#4285f4") +
xlab("pearson correlation") +
ggtitle(" Density histogram of pearson correlation values, \n fly (genetic interaction profile based on cell count)")
#ggsave("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_PDF/hist_fly_cellcount.pdf", width = 9, height = 9)
#ggsave("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_png/hist_fly_cellcount.png", width = 9, height = 9)
# Fly features
ggplot(fly_corr_common_anno_n4, aes(x = r_features)) +
theme_b110() +
geom_histogram(binwidth = 0.001, color = "#d62d20") +
xlab("pearson correlation") +
ggtitle(" Density histogram of pearson correlation values, \n fly (morphology features)")
#ggsave("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_PDF/hist_fly_features.pdf", width = 9, height = 9)
#ggsave("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_png/hist_fly_features.png", width = 9, height = 9)
# Yeast
ggplot(yeast_corr_anno_multiple_n4, aes(x = as.numeric(value))) +
theme_b110() +
geom_histogram(binwidth = 0.001, color = "#ffa700") +
xlab("pearson correlation") +
ggtitle(" Density histogram of pearson correlation values, \n yeast (genetic interaction profile similarity)")
#ggsave("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_PDF/hist_yeast.pdf", width = 9, height = 9)
#ggsave("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_png/hist_yeast.png", width = 9, height = 9)
# Human
ggplot(human_corr_anno_n4, aes(x = r)) +
theme_b110() +
geom_histogram(binwidth = 0.001, color = "#008744") +
xlab("pearson correlation") +
ggtitle(" Density histogram of pearson correlation values, \n human (gene effect)")
#ggsave("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_PDF/hist_human.pdf", width = 9, height = 9)
#ggsave("/Users/b110/Desktop/VeraPeters/conserved_functional_groups/Plots_png/hist_human.png", width = 9, height = 9)